import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.stats as stat
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, binarize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, auc, make_scorer
from os import system
We fetch the data and store it in a data frame. We also store both the dataset and its attribute descriptions in an HDF store and retrieve them, so the column descriptions can be looked up whenever needed.
# Load the raw bank-marketing dataset. Downstream code indexes the label
# column as bank_df["Target"] (capital T).
bank_df = pd.read_csv("bank-full.csv")

# Human-readable description for every column.
# NOTE(review): attaching an ad-hoc attribute to a DataFrame is fragile --
# pandas does not propagate custom attributes through copies/transforms and
# newer versions emit a UserWarning for it; verify this survives your pandas
# version, or keep the dict as a plain module-level variable.
# FIX: the label entry was keyed "target" while the actual column is "Target";
# the key now matches the column name so description lookups succeed.
bank_df.col_description = {
    "age": "numeric",
    "job": "type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')",
    "marital": "marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)",
    "education": "(categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')",
    "default": "has credit in default? (categorical: 'no','yes','unknown')",
    "balance": "average yearly balance, in euros (numeric)",
    "housing": "has housing loan? (categorical: 'no','yes','unknown')",
    "loan": "has personal loan? (categorical: 'no','yes','unknown')",
    "contact": "contact communication type (categorical: 'cellular','telephone')",
    "day": "last contact day of the month (numeric 1 -31)",
    "month": "last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')",
    "duration": "last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.",
    "campaign": "number of contacts performed during this campaign and for this client (numeric, includes last contact)",
    "pdays": "number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)",
    "previous": "number of contacts performed before this campaign and for this client (numeric)",
    "poutcome": "outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')",
    "Target": "has the client subscribed a term deposit? (binary: yes,no)"
}

# Persist the DataFrame together with its description dict in an HDF5 store;
# the dict is saved as a storer attribute so it travels with the data.
with pd.HDFStore("Bank data.h5") as bank_store:
    bank_store.put("df", bank_df)
    bank_store.get_storer("df").attrs.col_description = bank_df.col_description

# Read both back to confirm the round trip works.
with pd.HDFStore("Bank data.h5") as bank_store:
    bank_df = bank_store.get("df")
    bank_df.col_description = bank_store.get_storer("df").attrs.col_description

# Preview the first rows.
bank_df.head()
List the descriptions stored for the column
bank_df.col_description
We perform two levels of EDA. The first is the broad based univariate and bivariate analysis to display the overall trends and the second answers the individual sub sections under this question
Describe the columns
bank_df.describe()
From the summary statistics of the dataset, we can see the range and spread of each numeric column.
Listing the info of the dataset
bank_df.info()
The data is consistent and does not have nulls that need to be addressed
Run through the columns to view unique values and see if there is any column that may need to be dropped due to a very large set of values
# Report the cardinality of every column so we can spot any identifier-like
# feature (huge number of distinct values) that should be dropped.
for column_name in bank_df.columns:
    print(column_name, ":", bank_df[column_name].nunique())
The number of distinct values in each of the columns also look appropriate. What seems to be a question is whether we should consider the "day", "month" features. Do the contact day/month hold any influence in determining the yes/no outcome of a contact
Plotting a histogram for numerical values within the data frame
plt.figure(figsize = (10,10))
bank_df.drop("Target", axis = 1).hist(stacked = False, figsize = (15,50), layout = (8,2))
Most of the distributions seem to be right skewed, except day. This makes sense as the contacts should generally be evenly distributed across a month.
Viewing the data before plotting it
bank_df.head()
Converting the target variable into 0s and 1s
bank_df = bank_df.replace({"Target": {"yes":1, "no":0}})
sns.distplot(bank_df["age"],bins = 20)
sns.boxplot(bank_df["age"])
as described previously, age seems to have outliers towards the right. The median age is around 39
plt.figure(figsize = (15,5))
sns.countplot(x = "job", data = bank_df)
Most of the jobs are management, technician and blue-collar. The least represented groups are students
sns.countplot(x = "marital", data = bank_df)
# Share of each marital status in the overall population.
marital_counts = bank_df["marital"].value_counts()
total_marital = bank_df["marital"].count()
print("Divorced:", marital_counts["divorced"] / total_marital)
print("Single:", marital_counts["single"] / total_marital)
print("Married:", marital_counts["married"] / total_marital)
Married people make up more than half the population, divorced around 11% and single people approximately over 28%
sns.countplot(x = "education", data = bank_df)
print("Primary:", bank_df[bank_df["education"] == "primary"]["education"].count() / bank_df["education"].count())
print("Secondary:",bank_df[bank_df["education"] == "secondary"]["education"].count() / bank_df["education"].count())
print("Tertiary:",bank_df[bank_df["education"] == "tertiary"]["education"].count() / bank_df["education"].count())
primary educated people are the smallest group(15%), over half the people are secondary educated, 29% people have tertiary educations. There is also a small group of unknowns
sns.countplot(x = "default", data = bank_df)
print("Yes:", bank_df[bank_df["default"] == "yes"]["default"].count() / bank_df["default"].count())
print("No:",bank_df[bank_df["default"] == "no"]["default"].count() / bank_df["default"].count())
Just under 2% of people defaulted
sns.distplot(bank_df["balance"], bins = 10, kde = False)
Bulk of the account balances lie under 30000
plt.figure(figsize = (20,5))
sns.boxplot(bank_df["balance"])
There is a large number of outliers in the balance column, however there are very few outside 50000 and can be eliminated if needed
sns.countplot(x = "housing", data = bank_df)
print("Yes:", bank_df[bank_df["housing"] == "yes"]["housing"].count() / bank_df["housing"].count())
print("No:",bank_df[bank_df["housing"] == "no"]["housing"].count() / bank_df["housing"].count())
55% of people in the dataset have housing loans
sns.countplot(x = "loan", data = bank_df)
print("Yes:", bank_df[bank_df["loan"] == "yes"]["loan"].count() / bank_df["loan"].count())
print("No:",bank_df[bank_df["loan"] == "no"]["loan"].count() / bank_df["loan"].count())
Around 16% of the people are personal loan customers
sns.countplot(x = "contact", data = bank_df)
print("Cellular:", bank_df[bank_df["contact"] == "cellular"]["contact"].count() / bank_df["contact"].count())
print("Telephone:",bank_df[bank_df["contact"] == "telephone"]["contact"].count() / bank_df["contact"].count())
Around 6% of people use a telephone and around 65% use a cellular phone; there is also a considerable number of unknowns that will have to be treated.
plt.figure(figsize = (20,5))
sns.countplot(x = "day", data = bank_df)
There does not seem to be a clear pattern in the contacts on different days. The data seems to have a normal distribution overall if put together
plt.figure(figsize = (20,5))
sns.countplot(x = "month", data = bank_df)
There seems to be a large number of contacts starting in the new financial year for around 5 months. April, may, june, july, august seem to have the highest activity.
plt.figure(figsize = (10,5))
sns.distplot(bank_df["duration"], bins = 20)
bank_df[bank_df["duration"]>900]["duration"].count()/bank_df["duration"].count()
Duration is strongly right tailed, there are a small percentage(~3%) of calls that last over 15 minutes.
plt.figure(figsize = (10,5))
sns.distplot(bank_df["campaign"], bins = 20 , kde = False)
bank_df[bank_df["campaign"]>10]["campaign"].count()/bank_df["campaign"].count()
Around 2.6% of the values are > 10 contacts in the campaign
plt.figure(figsize = (10,5))
sns.distplot(bank_df["pdays"], bins = 20 , kde = False)
bank_df[bank_df["pdays"]>100]["pdays"].count()/bank_df["pdays"].count()
Most pdays occurrences are -1; however, there is a considerable chunk (15%) where the pdays values are larger, in the hundreds.
plt.figure(figsize = (10,5))
sns.distplot(bank_df["previous"], bins = 30 , kde = False)
print(bank_df[bank_df["previous"]>100]["previous"].count()/bank_df["previous"].count())
There is a very small percentage of rows with previous > 100, (just 1 row) these are clearly outliers. We can remove them in the data cleaning steps
plt.figure(figsize = (10,5))
sns.countplot(x = "poutcome", data = bank_df)
print("Unknown:", bank_df[bank_df["poutcome"] == "unknown"]["poutcome"].count() / bank_df["poutcome"].count())
print("Other:", bank_df[bank_df["poutcome"] == "other"]["poutcome"].count() / bank_df["poutcome"].count())
print("Failure:",bank_df[bank_df["poutcome"] == "failure"]["poutcome"].count() / bank_df["poutcome"].count())
print("Success:",bank_df[bank_df["poutcome"] == "success"]["poutcome"].count() / bank_df["poutcome"].count())
This column seems to have only 3% values for success. If this is not strongly relevant to the target column, this column should be dropped.
EDIT: During the analysis, it turns out the success cases in this column are highly relevant to target "Yes". We will try to find a way to treat this column
</font>
plt.figure(figsize = (10,5))
sns.countplot(x = "Target", data = bank_df)
print("Yes:", bank_df[bank_df["Target"] == 1]["Target"].count() / bank_df["Target"].count())
print("No:", bank_df[bank_df["Target"] == 0]["Target"].count() / bank_df["Target"].count())
The target column has a low number of Yes values (11%) and will need to be balanced during model development
bank_df.head(1)
bank_df["Target"].unique()
The target column does not have any issues
Displaying the dataset rows for further Bivariate analysis..
bank_df.head()
sns.boxplot(x = "Target", y = "age", data = bank_df)
Age seems to have a very low effect on the target outcome. The median is similar for both 0 and 1 outcomes
plt.figure(figsize = (30,5))
sns.boxplot(x = "balance", y = "Target", orient = "h", data = bank_df)
plt.figure(figsize = (30,5))
sns.boxplot(x = "balance", y = "Target", orient = "h", data = bank_df[bank_df["balance"]<40000])
people who make term deposits seem to have a higher median balance in their accounts
plt.figure(figsize = (20,10))
sns.boxplot(x = "balance", y = "education", hue = "Target", data = bank_df)
People with secondary education that opt for term deposit have similar median balances to people who have secondary education but dont make term deposits. For all other groups the median account balance is higher if they opt for term deposits
plt.figure(figsize = (20,20))
sns.boxplot(x = "balance", y = "job", hue = "Target", data = bank_df)
People in admin, services and student roles who have term deposits/dont have term deposits have similar median account balances, for all other groups the median account balances are higher if the person chooses a term deposit
plt.figure(figsize = (20,5))
sns.boxplot(x = "balance", y = "marital",hue = "Target", data = bank_df)
plt.figure(figsize = (20,5))
sns.boxplot(x = "balance", y = "marital",hue = "Target", data = bank_df[bank_df["balance"]<40000])
Across all groups those who opt for term balances have higher median account balance.
plt.figure(figsize = (20,5))
sns.boxplot(x = "balance", y = "housing", hue = "Target",data = bank_df)
plt.figure(figsize = (20,5))
sns.boxplot(x = "balance", y = "housing", hue = "Target",data = bank_df[bank_df["balance"]<40000])
Across both groups, people with term deposits have a higher median account balance. Also, those without a home loan have a higher balance than the people with home loans.
plt.figure(figsize = (20,5))
sns.boxplot(x = "balance", y = "loan",hue = "Target", data = bank_df[bank_df["balance"]<40000])
## reducing the major outliers from visualization by using <40000 balance check
Regardless of whether a person has a personal loan or not, if he/she makes a term deposit, his median account balance would generally be higher than the people who dont. People without personal loans have a higher median account balance than those who do
plt.figure(figsize = (20,5))
sns.boxplot(x = "balance", y = "contact", hue = "Target", data = bank_df[bank_df["balance"]<40000])
## reducing the major outliers from visualization by using <40000 balance check
telephone holders have the highest median account balance in the categories and those that make a term deposit have a higher median balance than those who dont
plt.figure(figsize = (20,10))
sns.boxplot(x = "balance", y = "month", hue = "Target", data = bank_df[bank_df["balance"]<40000])
The median balance is higher in almost all cases where the term deposit was yes. except the month of march. Put in perspective with the low number of contacts in may, this may mean that lower contacts in march lead to lower term deposits, and the outcome may be affected due to lower base
plt.figure(figsize = (20, 5))
sns.boxplot(x = "balance", y = "poutcome", hue = "Target", data = bank_df[bank_df["balance"]<40000])
## reducing the major outliers from visualization by using <40000 balance check
The category "success" has a similar median balance for both term deposit yes and no. Success may not depend on account balance
plt.figure(figsize = (20, 5))
sns.stripplot(x = "balance", y = "poutcome", hue = "Target", data = bank_df[bank_df["balance"]<40000], jitter = True)
## reducing the major outliers from visualization by using <40000 balance check
The strip plot confirms our previous observation. A lot of success category relates to "Target" yes and it does not necessarily need a large account balance
plt.figure(figsize = (20,10))
sns.boxplot(x = "age", y = "job", hue = "Target", data = bank_df)
Depending on employment, the median age differs for people who have made a term deposit and for those who havent. This should be useful for our machine learning models
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "marital", hue = "Target", data = bank_df)
Single people make term deposits when they are younger, married and divorced people have a higher median age if they make term deposits
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "education", hue = "Target", data = bank_df)
As education levels rise, the median age for making term deposits drops
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "default", hue = "Target", data = bank_df)
people who default, generally make term deposits at a higher median age. It is the other way around for people who have no defaults
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "housing", hue = "Target", data = bank_df)
People have a lower median age for making deposits for housing loans "yes" and "no"
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "loan", hue = "Target", data = bank_df)
People have a lower median age for making deposits, for personal loans "yes" and "no"
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "contact", hue = "Target", data = bank_df)
People with telephones who make deposits usually have a higher median age than others. For people who use cellulars, the median age for making deposits is lower
plt.figure(figsize = (20,10))
sns.boxplot(x = "age", y = "month", hue = "Target", data = bank_df)
when we look at this graph in correlation with the count plot of the contacts against months, we see that the months with lower contacts correspond to a higher median age of customers who make term deposits. For months with higher number of contacts, the median age of people who made term deposits is lower
There is a need to investigate this further. We will try a strip plot and success rate measurement for each month to see if there is a value in keeping the months around</font>
plt.figure(figsize = (20,10))
sns.stripplot(x = "age", y = "month", hue = "Target", data = bank_df, jitter = True)
Though a little difficult to interpret, the graph indicates that a lot of target "yes" were contacted in february, march and april
# Success rate per contact month: (positive contacts / all contacts) * 100.
contacts_per_month = bank_df["month"].value_counts()
wins_per_month = bank_df.loc[bank_df["Target"] == 1, "month"].value_counts()
monthly_series = wins_per_month.div(contacts_per_month).mul(100)
monthly_series
This analysis makes it much clearer. Last contacts in months of low contact volume have higher success rates than other months. Contact months of September, October, December and March correlate with a higher percentage of target "Yes". Thus we will retain the "month" column.
plt.figure(figsize = (20,5))
sns.boxplot(x = "age", y = "poutcome", hue = "Target", data = bank_df)
for poutcome "success", the median age for term deposits is higher.
plt.figure(figsize = (20,10))
sns.boxplot(x = "day", y = "job", hue = "Target", data = bank_df)
Apart from housemaids, students, retired people, the median contact days for all other groups where term deposits were made, are less than the cases where term deposits were not made.
plt.figure(figsize = (20,5))
sns.boxplot(x = "day", y = "education", hue = "Target", data = bank_df)
Higher educated groups have a lower contact day median than others. However, the median is 15 in many cases and seems to offer no predictive power
plt.figure(figsize = (20,5))
sns.boxplot(x = "day", y = "default", hue = "Target", data = bank_df)
Median days for those who had term deposits is lower
plt.figure(figsize = (20,5))
sns.boxplot(x = "day", y = "housing", hue = "Target", data = bank_df)
Median days for those who had term deposits is lower, for both, housing loan yes and no
plt.figure(figsize = (20,5))
sns.boxplot(x = "day", y = "loan", hue = "Target", data = bank_df)
Median days for those who had term deposits is lower, for both, personal loan yes and no
plt.figure(figsize = (20,10))
sns.boxplot(x = "day", y = "month", hue = "Target", data = bank_df)
No clear inferences from this graph, although it may relate to the countplot of contacts per month.
plt.figure(figsize = (20,5))
sns.boxplot(x = "day", y = "poutcome", hue = "Target", data = bank_df)
The median contact day for success and failure is approximately the same. There is also a large number of unknowns for which the contact day median is lower for term deposits "yes"
plt.figure(figsize = (20,15))
sns.boxplot(x = "duration", y = "job", hue = "Target", data = bank_df)
The duration is generally higher for target "yes". Seems to be a case of more engaged customers
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "marital", hue = "Target", data = bank_df)
same as above. Duration for contact is higher for target = "yes"
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "education", hue = "Target", data = bank_df)
As the level of education increases, the duration of the call for term deposits "yes", goes down
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "default", hue = "Target", data = bank_df)
People with defaults who make term deposits talk more than people who dont have a default and make term deposits
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "housing", hue = "Target", data = bank_df)
People with loans who have term deposits tend to talk more than people who dont have a loan and have term deposits
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "loan", hue = "Target", data = bank_df)
People with loans who have term deposits tend to talk more than people who dont have a loan and have term deposits
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "contact", hue = "Target", data = bank_df)
The median duration for unknown is quite a bit more than cellular and telephones. This seems to be a different distribution altogether
plt.figure(figsize = (20,15))
sns.boxplot(x = "duration", y = "month", hue = "Target", data = bank_df)
The median call for target "1" or "yes" has higher duration in months where there are more contacts.
</font>
plt.figure(figsize = (20,5))
sns.boxplot(x = "duration", y = "poutcome", hue = "Target", data = bank_df)
The median duration for poutcome "success" was lower.
plt.figure(figsize = (20,15))
sns.boxplot(x = "campaign", y = "job", hue = "Target", data = bank_df)
This element seems to offer no clear information through a box plot. We are switching to a stripplot to see if the situation improves
plt.figure(figsize = (20,15))
sns.stripplot(x = "campaign", y = "job", hue = "Target", jitter=True, data = bank_df)
Lower number of campaign connects seem to correlate with target "Yes". It could be that the people who want to say yes, say so even with limited contacts.
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "marital", hue = "Target", data = bank_df, jitter =True)
Same as above
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "education", hue = "Target", data = bank_df, jitter = True)
As the education level rises, there are more contacts for customers with target as "yes". Highly educated people need a larger number of contacts
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "default", hue = "Target", data = bank_df, jitter = True)
Very few defaulters make term deposits
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "housing", hue = "Target", data = bank_df, jitter = True)
People with no housing loans are more likely to make term deposits, although some of them need more contacts than for those with housing loans who choose to make term deposits
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "loan", hue = "Target", data = bank_df, jitter = True)
People without personal loans are far more likely to make term deposits
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "contact", hue = "Target", data = bank_df, jitter = True)
People with Cellulars are more likely to make term deposits. Considering that unknowns make even lower deposits than those with telephone, it makes sense to order these three as unknown = -1, telephone = 1, cellular = 2
plt.figure(figsize = (20,15))
sns.stripplot(x = "campaign", y = "month", hue = "Target", data = bank_df, jitter = True)
No additional Observations for months and days
plt.figure(figsize = (20,5))
sns.stripplot(x = "campaign", y = "poutcome", hue = "Target", data = bank_df, jitter = True)
Success closely correlates with target "yes". However, unknowns also have a fair degree of target"yes". This means treatment for unknowns is required
plt.figure(figsize = (20,15))
sns.boxplot(x = "pdays", y = "job", hue = "Target", data = bank_df)
Pdays does not seem to offer clearly visualizable information and we will not proceed on this analysis
plt.figure(figsize = (20,15))
sns.boxplot(x = "previous", y = "job", hue = "Target", data = bank_df)
"Previous" does not seem to offer clearly visualizable information and we will not proceed on this analysis
bank_df.shape
The data has 45211 rows and 17 columns
sns.pairplot(bank_df)
bank_df.info()
The data types look correct, The numeric types are appropriate, and the remaining will be transformed during data cleaning.
bank_df.isnull().sum()
The nulls in this data set are 0. No adjustments required here
# Print the min/max range of every numeric column as a sanity check for
# out-of-bound values.
for label, column in [("Age", "age"), ("Balance", "balance"), ("Day", "day"),
                      ("Duration", "duration"), ("Campaign", "campaign"),
                      ("Pdays", "pdays"), ("Previous", "previous")]:
    print(label + ":", bank_df[column].min(), ",", bank_df[column].max())
The numerical data looks in the right ranges and there seem to be no out of bound values
bank_df[bank_df == np.nan].count()
There is no Nan value in the dataset either
np.percentile(bank_df["age"],[0,25,50,75,100])
sns.boxplot(bank_df["age"])
Age median is 39, and the 25%,75% are 33, 48. which indicates a large number of people within this small age range
np.percentile(bank_df["balance"],[0,25,50,75,100])
plt.figure(figsize =(20,5))
sns.boxplot(bank_df["balance"])
Balance median is 448, and the 25%,75% are 72, 1428.
np.percentile(bank_df["day"],[0,25,50,75,100])
sns.boxplot(bank_df["day"])
Day median is 16 and the rest of the attributes are aligned with a 31 day month.
np.percentile(bank_df["campaign"],[0,25,50,75,100])
plt.figure(figsize = (20,4))
sns.boxplot(bank_df["campaign"])
Campaign median contacts are 2, and the 25%,75% are 1,3. As investigated previously, this distribution follows a long right tail and has a significant number of rows(>1000) that we have chosen not to drop
np.percentile(bank_df["duration"],[0,25,50,75,100])
plt.figure(figsize = (20,4))
sns.boxplot(bank_df["duration"])
Duration median is 180, so the average call lasts about 6 minutes. However, there are several calls that last much longer and might need to be curtailed depending on further analysis (time should be spent on the customers most likely to make a deposit).
np.percentile(bank_df["pdays"],[0,25,50,75,100])
plt.figure(figsize = (20,4))
sns.boxplot(bank_df["pdays"])
The Pday median is -1 as there is an overwhelming number of customers who have not been contacted. In such a case, it makes sense to treat this column. As the number of pdays increases, the probability of term deposit goes down, so it makes sense that these -1s should be converted to 999
np.percentile(bank_df["previous"],[0,25,50,75,100])
plt.figure(figsize = (20,3))
sns.boxplot(bank_df["previous"])
There is one clear outlier, that is well over 250. This row will be dropped or set to the average of the others.
z_critical = 3
age_z = stat.zscore(bank_df["age"])
print("Median",bank_df["age"].median())
print("Z max", age_z.max())
bank_df[age_z > z_critical]
Despite some outliers on the higher side, we will continue with this column as is. This is because the outliers seem to be in a reasonable range as proven in 5 point summaries
z_critical = 3
balance_z = stat.zscore(bank_df["balance"])
print("Median",bank_df["balance"].median())
print("Z max", balance_z.max())
bank_df[balance_z > z_critical]
Account balances seem to be well outside the acceptable Z score ranges. However, we have seen that there are quite a few occurences of balances in an outlier region. We will accept this as the nature of the distribution. The column will be scaled through min max scaler
z_critical = 3
duration_z = stat.zscore(bank_df["duration"])
print("Median",bank_df["duration"].median())
print("Z max", duration_z.max())
bank_df[duration_z > z_critical]
The duration has a large right skew due to outliers. However, we see that there are several calls that are above the Z score "3" range. Thus we will retain this column as is
z_critical = 3
campaign_z = stat.zscore(bank_df["campaign"])
print("Median",bank_df["campaign"].median())
print("Z max", campaign_z.max())
print(bank_df[bank_df["campaign"] > 20]["campaign"].count())
print(bank_df[bank_df["campaign"] > 40]["campaign"].count())
bank_df[campaign_z > z_critical]
Even the campaign column has information which follows a set distribution. While the outliers are statistically quite far away, but they tend to follow the distribution of the column. We will thus retain them
z_critical = 3
pdays_z = stat.zscore(bank_df["pdays"])
print("Median",bank_df["pdays"].median())
print("Z max", pdays_z.max())
bank_df[pdays_z > z_critical]
The pdays data seems to be correct. There is a signficant portion of pdays which seem like outliers. This is primarily due to the large number of -1s in the distribution. This will change if we replace the -1s with 999
z_critical = 3
previous_z = stat.zscore(bank_df["previous"])
print("Median",bank_df["previous"].median())
print("Z max", previous_z.max())
bank_df[previous_z > z_critical]
The previous column has 1 major outlier, that is the value 275. This will be replaced by an average in data cleaning
Viewing the data
</font>
bank_df.head()
bank_df.info()
bank_df.describe()
We replace the outliers in column "previous"; there is only one value (275) that seems like an error. We also replace the -1 values in "pdays" with 999.
# Replace the single extreme outlier in "previous" (the value 275) with the
# mean of the remaining values. FIX: the replacement mean is computed once,
# up front -- the original recomputed the filtered-column mean inside the
# .apply callback for every single row.
_previous_mean = bank_df.loc[bank_df["previous"] < 275, "previous"].mean()

def update_previous(previous):
    """Return the precomputed column mean for the known outlier value 275; otherwise return the value unchanged."""
    return _previous_mean if previous == 275 else previous

bank_df["previous"] = bank_df["previous"].apply(update_previous)
def update_pdays(pdays):
    """Map the sentinel -1 ("never contacted") to 999, so a large pdays value
    consistently means "contacted long ago or never", matching the dataset
    convention described in the column notes."""
    return 999 if pdays == -1 else pdays
bank_df["pdays"] = bank_df["pdays"].apply(update_pdays)
We look at the columns that contain "unknown" category values and determine appropriate actions for them.
bank_df[bank_df["Target"] == 1]["education"].value_counts() / bank_df["education"].value_counts()
The unknowns in education seem to be somewhere between tertiary and secondary in terms of the target variable relevance. Thus we will use the grading - primary:0, secondary:1, unknown:2, tertiary:3
bank_df[bank_df["Target"] == 1]["contact"].value_counts() / bank_df["contact"].value_counts()
The unknowns in contact seem to be much below cellular and telephone in terms of the target variable relevance. Thus we will use the grading - "unknown" : -1, "telephone": 1, "cellular": 2
bank_df[bank_df["Target"] == 1]["poutcome"].value_counts() / bank_df["poutcome"].value_counts()
The unknowns in poutcome seem to be much below the other categories in terms of the target variable relevance. Thus we will use the grading - "unknown" : -1, "failure": 1, "other": 2, "success": 3
We replace the education, contact and poutcome columns with the ordinal values determined during our analysis.
# Ordinal encodings chosen from the EDA above: within each column, categories
# are ranked by how strongly they associate with Target == 1.
ordinal_maps = {
    "education": {"primary": 0, "secondary": 1, "unknown": 2, "tertiary": 3},
    "contact": {"unknown": -1, "telephone": 1, "cellular": 2},
    "poutcome": {"unknown": -1, "failure": 1, "other": 2, "success": 3},
}
# Nominal categoricals are one-hot encoded; the first level is dropped to
# avoid perfect collinearity among the dummy columns.
dummy_columns = ["job", "marital", "default", "housing", "loan", "month"]
bank_df = pd.get_dummies(bank_df, columns=dummy_columns, drop_first=True).replace(ordinal_maps)
# "day" showed no predictive signal in the bivariate analysis, so drop it.
bank_df = bank_df.drop(["day"], axis=1)
bank_df.head()
The numerical columns with large values are scaled and brought down into a range that converges faster </font>
# Scale the wide-range numeric/ordinal columns into [0, 1] so distance-based
# and gradient-based models converge faster.
scaler = MinMaxScaler()
transformed_columns = scaler.fit_transform(bank_df[["age", "education", "balance", "contact", "campaign", "pdays", "previous", "poutcome"]])
transformed_df = pd.DataFrame(transformed_columns, columns = ["age", "education", "balance", "contact", "campaign", "pdays", "previous", "poutcome"])
# NOTE(review): concat aligns on index -- this is only correct because
# transformed_df gets a fresh RangeIndex and bank_df presumably still carries
# the default RangeIndex from read_csv; verify if rows are ever dropped
# upstream. "duration" is deliberately discarded here: per the column
# description, it is only known after the call ends, so keeping it would leak
# the outcome into the features.
bank_df = pd.concat([transformed_df, bank_df.drop(["age", "education", "balance", "contact", "campaign", "pdays", "previous", "poutcome", "duration"], axis = 1)], axis = 1)
bank_df.head()
We now have 36 features in our dataset
</font>
# Separate features and label; pop() removes "Target" from bank_df in place.
y = bank_df.pop("Target")
x = bank_df
# 70/30 split with a fixed seed for reproducibility.
# NOTE(review): the classes are imbalanced (~11% positives per the EDA) --
# consider stratify=y so both splits keep the same class ratio; TODO confirm.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 101)
bank_df.head()
</font>
# Baseline model: logistic regression with the liblinear solver.
logistic_regressor = LogisticRegression(solver = "liblinear")
logistic_regressor.fit (x_train, y_train)
y_pred_logistic = logistic_regressor.predict (x_test)
# Per-class precision/recall/F1 plus the raw confusion matrix.
print(classification_report (y_test, y_pred_logistic))
print(confusion_matrix(y_test, y_pred_logistic))
# Heatmap of the confusion matrix with an explicit label order [0, 1].
cm = confusion_matrix(y_test, y_pred_logistic, labels = [0, 1])
plt.figure (figsize = (6,4))
sns.heatmap (cm, annot = True, cmap = "coolwarm")
# Mean accuracy on the held-out set.
logistic_regressor.score(x_test, y_test)
Although overall, the logistic regression scores well (89%), the imbalance of the data causes it to perform poorly on recall for cases where target = "1"
</font>
# ROC curve built from hard 0/1 predictions.
# NOTE(review): with binary labels instead of scores, roc_curve yields only a
# single interior point, so this "curve" is degenerate; the probability-based
# version in the following cell is the meaningful one.
fpr_logistic, tpr_logistic, thresholds_logistic = roc_curve(y_test, y_pred_logistic)
plt.plot([0,1],[0,1],"r--", label = "line")  # chance diagonal for reference
plt.plot(fpr_logistic,tpr_logistic,"b--", label = "RoC curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
</font>
# ROC curve from predicted probabilities of the positive class (column 1 of
# predict_proba), which gives the full threshold sweep.
y_pred_logistic_prob = logistic_regressor.predict_proba(x_test)[:, 1]
fpr_logistic_prob, tpr_logistic_prob, thresholds_logistic_prob = roc_curve(y_test, y_pred_logistic_prob)
plt.plot([0, 1], [0, 1], "r--", label="line")  # chance diagonal for reference
plt.plot(fpr_logistic_prob, tpr_logistic_prob, "b--", label="RoC curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
</font>
# Custom AUC scorer for grid search: since make_scorer is built with
# needs_proba=True, y_predicted arrives as class-1 probabilities.
def auc_val(y_test, y_predicted):
    """Return the area under the ROC curve for class-1 probabilities."""
    fpr_custom, tpr_custom, _ = roc_curve(y_test, y_predicted, pos_label=1)
    return auc(fpr_custom, tpr_custom)

custom_scorer = make_scorer(auc_val, greater_is_better=True, needs_proba=True)
</font>
# Custom AUC scorer with a recall penalty, used to tune AdaBoost.
# Bug fix: make_scorer is built with needs_proba=True, so y_predicted is an
# array of class-1 *probabilities*; the old code compared those floats
# directly against the 0/1 labels, so the penalty was ~1 almost always.
# The probabilities are now binarized at 0.5 before computing the penalty.
def auc_val_ada(y_test, y_predicted):
    """Return AUC minus the fraction of true class-1 cases predicted as 0.

    The penalty term pushes grid search towards parameter sets with better
    recall on the minority (Target = 1) class; the score can be negative.
    """
    fpr_custom, tpr_custom, _ = roc_curve(y_test, y_predicted, pos_label=1)
    y_true = np.asarray(y_test)
    y_hat = (np.asarray(y_predicted) >= 0.5).astype(int)  # hard labels from probabilities
    positives = y_true == 1
    # Share of actual positives that were misclassified (guard empty slice).
    penalty = np.mean(y_hat[positives] != 1) if positives.any() else 0.0
    return auc(fpr_custom, tpr_custom) - penalty  # may be negative by design

custom_scorer_ada = make_scorer(auc_val_ada, greater_is_better=True, needs_proba=True)
</font>
# Baseline KNN: k = 2, Euclidean metric, distance-weighted votes.
knn_classifier = KNeighborsClassifier(n_neighbors=2, metric="euclidean", weights="distance").fit(x_train, y_train)
y_pred_knn = knn_classifier.predict(x_test)
print(classification_report(y_test, y_pred_knn))
print(confusion_matrix(y_test, y_pred_knn))
cm = confusion_matrix(y_test, y_pred_knn, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
The base KNN does slightly better than the logistic regression in terms of recall on Target = "1" cases. We will attempt to improve it through optimization based on an elbow plot
</font>
# Elbow plot: test-set misclassification rate for each k in 3..19.
error_rate = []
for k in range(3, 20):
    model = KNeighborsClassifier(n_neighbors=k, metric="euclidean", weights="distance")
    model.fit(x_train, y_train)
    error_rate.append(np.mean(model.predict(x_test) != y_test))
plt.figure(figsize=(10, 6))
plt.plot(range(3, 20), error_rate, color='blue', ls='--', marker='v', markerfacecolor="red")
plt.title("Elbow plot")
plt.xlabel("K value")
plt.ylabel("Mean Error")
Elbows emerge at k = 9 and k = 11. We will choose k = 11 for further analysis. </font>
</font>
# Final KNN at k = 11, chosen from the elbow plot above.
knn_classifier = KNeighborsClassifier(n_neighbors=11, metric="euclidean", weights="distance")
knn_classifier.fit(x_train, y_train)
y_pred_knn = knn_classifier.predict(x_test)
print(classification_report(y_test, y_pred_knn))
print(confusion_matrix(y_test, y_pred_knn))
# Train vs test accuracy, to gauge overfitting.
print(knn_classifier.score(x_train, y_train))
print(knn_classifier.score(x_test, y_test))
cm = confusion_matrix(y_test, y_pred_knn, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
The optimal KNN scores better overall as compared to the logistic regression but continues to perform poorly in recall for target "1" cases
</font>
# ROC from the tuned KNN's hard predictions (single operating point).
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_pred_knn)
plt.plot([0, 1], [0, 1], "r--", label="line")
plt.plot(fpr_knn, tpr_knn, "b--", label="RoC curve")
plt.title("ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
</font>
# Probability-based ROC for the tuned KNN.
y_pred_knn_prob = knn_classifier.predict_proba(x_test)[:, 1]
fpr_knn_prob, tpr_knn_prob, thresholds_knn_prob = roc_curve(y_test, y_pred_knn_prob)
plt.plot([0, 1], [0, 1], "r--", label="line")
plt.plot(fpr_knn_prob, tpr_knn_prob, "b--", label="RoC curve")
plt.title("ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
</font>
# Unconstrained decision tree: compare train vs test accuracy.
d_tree = DecisionTreeClassifier(criterion="gini")
d_tree.fit(x_train, y_train)
# Print both scores explicitly: as bare expressions only the last one would
# be displayed in a notebook, silently discarding the train score that the
# overfitting comparison below depends on.
print(d_tree.score(x_train, y_train))
print(d_tree.score(x_test, y_test))
An unparameterized decision tree seems to overfit the training data and score lower on the test
</font>
# Depth-limited tree (max_depth = 7) to curb the overfitting seen above.
d_tree = DecisionTreeClassifier(criterion="gini", max_depth=7)
d_tree.fit(x_train, y_train)
print(d_tree.score(x_train, y_train))
print(d_tree.score(x_test, y_test))
y_pred_dtree = d_tree.predict(x_test)
print(classification_report(y_test, y_pred_dtree))
cm = confusion_matrix(y_test, y_pred_dtree, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
</font>
# ROC from the depth-7 tree's hard predictions.
fpr_dtree, tpr_dtree, threshold_dtree = roc_curve(y_test, y_pred_dtree)
plt.plot([0, 1], [0, 1], "r--", label="line")
plt.plot(fpr_dtree, tpr_dtree, "b--", label="RoC curve")
plt.title("ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
</font>
# Probability-based ROC for the depth-7 tree.
y_pred_dtree_prob = d_tree.predict_proba(x_test)[:, 1]
# Bug fix: the curve was previously computed from the hard predictions
# (y_pred_dtree) instead of the probabilities extracted just above, so it
# duplicated the hard-prediction ROC rather than sweeping all thresholds.
fpr_dtree_prob, tpr_dtree_prob, threshold_dtree_prob = roc_curve(y_test, y_pred_dtree_prob)
plt.plot([0, 1], [0, 1], "r--", label="line")
plt.plot(fpr_dtree_prob, tpr_dtree_prob, "b--", label="RoC curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
</font>
# Grid search over the tree shape; class_weight="balanced" compensates for
# the scarce Target = 1 class. Scored with the custom AUC scorer.
param_grid = {
    "max_depth": [7, 8, 9],
    "class_weight": ["balanced"],
    "min_samples_split": [5, 6, 7, 8, 11],
    "min_samples_leaf": [10, 12, 13, 14],
    "max_features": [30, 31, 32, 35],
}
clf = GridSearchCV(d_tree, param_grid=param_grid, scoring=custom_scorer, cv=3, verbose=True, n_jobs=-1)
d_tree_clf = clf.fit(x_train, y_train)
print(d_tree_clf.score(x_train, y_train))
print(d_tree_clf.score(x_test, y_test))
print(d_tree_clf.best_params_)
</font>
# Refit a standalone tree using the best grid-search parameters; the grid
# keys (class_weight, max_depth, max_features, min_samples_leaf,
# min_samples_split) match the constructor arguments exactly.
best_tree_params = d_tree_clf.best_params_
d_tree_clf = DecisionTreeClassifier(criterion="gini", **best_tree_params)
d_tree_clf.fit(x_train, y_train)
print(d_tree_clf.score(x_train, y_train))
print(d_tree_clf.score(x_test, y_test))
y_pred_dtree_clf = d_tree_clf.predict(x_test)
y_pred_dtree_clf_prob = d_tree_clf.predict_proba(x_test)[:, 1]
print(classification_report(y_test, y_pred_dtree_clf))
cm = confusion_matrix(y_test, y_pred_dtree_clf, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
After multiple iterations (we went from broad parameter ranges to granular ranges), we were able to reach the best parameters for the decision tree to maximize recall.
</font>
# ROC from the tuned tree's hard predictions.
fpr_dtree_clf, tpr_dtree_clf, threshold_dtree_clf = roc_curve(y_test, y_pred_dtree_clf)
plt.plot([0, 1], [0, 1], "r--", label="line")
plt.plot(fpr_dtree_clf, tpr_dtree_clf, "b--", label="RoC curve")
plt.title("ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
</font>
# Probability-based ROC for the tuned tree, then side-by-side reports and
# confusion matrices for the basic vs tuned decision trees.
fpr_dtree_clf_prob, tpr_dtree_clf_prob, threshold_dtree_clf_prob = roc_curve(y_test, y_pred_dtree_clf_prob)
plt.plot([0, 1], [0, 1], "r--", label="line")
plt.plot(fpr_dtree_clf_prob, tpr_dtree_clf_prob, "b--", label="RoC curve")
plt.title("ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
print("Basic Decision Tree: \n", classification_report(y_test, y_pred_dtree))
print("Optimal Decision Tree: \n", classification_report(y_test, y_pred_dtree_clf))
print("Basic Decision Tree: \n", confusion_matrix(y_test, y_pred_dtree))
cm = confusion_matrix(y_test, y_pred_dtree, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
print("Optimal Decision Tree: \n", confusion_matrix(y_test, y_pred_dtree_clf))
cm = confusion_matrix(y_test, y_pred_dtree_clf, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
The optimal decision tree performs much better in terms of recall on the cases with Target = 1.
</font>
# Bagged decision trees: tune the base tree's depth/features inside the bag
# via the double-underscore parameter routing.
d_tree_bagger = DecisionTreeClassifier(criterion="gini", max_depth=13, max_features=15, class_weight="balanced")
bag_ = BaggingClassifier(base_estimator=d_tree_bagger, n_estimators=300)
param_grid = {
    "base_estimator__max_depth": [7, 8, 9, 10, 11],
    "base_estimator__max_features": [3, 4, 5, 6, 7],
}
clf = GridSearchCV(bag_, param_grid=param_grid, scoring=custom_scorer, cv=3, verbose=True, n_jobs=-1)
bag_ = clf.fit(x_train, y_train)
print(bag_.score(x_train, y_train))
print(bag_.score(x_test, y_test))
print(bag_.best_params_)
y_pred_bag = bag_.predict(x_test)
print("Bagging: \n", classification_report(y_test, y_pred_bag))
</font>
# Refit a plain BaggingClassifier with the best base-tree parameters found
# by the grid search above. (Removed a leftover param_grid definition that
# was dead code — nothing in this cell used it.)
best_bag_params = bag_.best_params_
d_tree_bagger = DecisionTreeClassifier(criterion="gini", max_depth=best_bag_params["base_estimator__max_depth"], max_features=best_bag_params["base_estimator__max_features"], class_weight="balanced")
bag_ = BaggingClassifier(base_estimator=d_tree_bagger, n_estimators=300)
bag_.fit(x_train, y_train)
print(bag_.score(x_train, y_train))
print(bag_.score(x_test, y_test))
y_pred_bag = bag_.predict(x_test)
print("Bagging: \n", classification_report(y_test, y_pred_bag))
print("Bagging: \n", confusion_matrix(y_test, y_pred_bag))
cm = confusion_matrix(y_test, y_pred_bag, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
Bagging performs better on predicting and recalling Target "1"s
</font>
# AdaBoost over shallow trees, tuned with the penalized AUC scorer so that
# recall on Target = 1 is rewarded (scores may come out negative).
d_tree_ada = DecisionTreeClassifier(criterion="gini", max_depth=4, max_features=5, class_weight="balanced")
ada_ = AdaBoostClassifier(base_estimator=d_tree_ada, n_estimators=300)
param_grid = {
    "base_estimator__max_depth": [1, 2, 3],
    "base_estimator__max_features": [16, 19, 22],
}
clf = GridSearchCV(ada_, param_grid=param_grid, scoring=custom_scorer_ada, cv=3, verbose=True, n_jobs=-1)
ada_ = clf.fit(x_train, y_train)
print(ada_.score(x_train, y_train))
print(ada_.score(x_test, y_test))
print(ada_.best_params_)
y_pred_ada = ada_.predict(x_test)
print("Adaptive Boosting: \n", classification_report(y_test, y_pred_ada))
The scoring on the test and train sets is negative because we have added a performance penalty to this scorer (custom_scorer_ada). This was done specifically to ensure that the adaptive boosting algorithm tried to improve recall, which was a problem faced in earlier iterations of the AdaBoost algorithm for this assignment.
</font>
# Refit AdaBoost with the best base-tree parameters from the search above.
# Dead code removed: a param_grid and a GridSearchCV instance were built
# here but never fitted or used — the classifier below is fit directly.
best_ada_params = ada_.best_params_
d_tree_ada = DecisionTreeClassifier(criterion="gini", max_depth=best_ada_params["base_estimator__max_depth"], max_features=best_ada_params["base_estimator__max_features"], class_weight="balanced")
ada_ = AdaBoostClassifier(base_estimator=d_tree_ada, n_estimators=300)
ada_.fit(x_train, y_train)
print(ada_.score(x_train, y_train))
print(ada_.score(x_test, y_test))
y_pred_ada = ada_.predict(x_test)
print("Adaptive Boosting: \n", classification_report(y_test, y_pred_ada))
print("Adaptive Boosting: \n", confusion_matrix(y_test, y_pred_ada))
cm = confusion_matrix(y_test, y_pred_ada, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
Through our custom scorer, we were able to bring up the Ada boost to a higher degree of recall for target cases = 1
</font>
# Gradient boosting: tune depth and features at a fixed low learning rate.
grb_ = GradientBoostingClassifier(n_estimators=300, learning_rate=0.03, max_depth=5, max_features=15)
param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'max_features': [5, 7, 9],
    'learning_rate': [0.03],
}
clf = GridSearchCV(grb_, param_grid=param_grid, scoring=custom_scorer, cv=3, verbose=True, n_jobs=-1)
grb_ = clf.fit(x_train, y_train)
print(grb_.score(x_train, y_train))
print(grb_.score(x_test, y_test))
print(grb_.best_params_)
y_pred_grb = grb_.predict(x_test)
print("Gradient Boosting: \n", classification_report(y_test, y_pred_grb))
</font>
# Refit gradient boosting with the tuned depth/features.
best_grb_params = grb_.best_params_
grb_ = GradientBoostingClassifier(n_estimators=300, learning_rate=0.03, max_depth=best_grb_params["max_depth"], max_features=best_grb_params["max_features"])
grb_.fit(x_train, y_train)
print(grb_.score(x_train, y_train))
print(grb_.score(x_test, y_test))
y_pred_grb = grb_.predict(x_test)
print("Gradient Boosting: \n", classification_report(y_test, y_pred_grb))
print("Gradient Boosting: \n", confusion_matrix(y_test, y_pred_grb))
cm = confusion_matrix(y_test, y_pred_grb, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
Gradient boosting performs better than other algorithms overall, but continues to face the recall issues that are prevalent, primarily due to the lack of sufficient 1s in the target column. We could use oversampling algorithms such as SMOTE on the training set; however, since the required libraries may not be installed on your machines, this has not been attempted.
</font>
# Random forest with balanced class weights; tune depth and features.
random_forest = RandomForestClassifier(max_depth=15, max_features=25, class_weight="balanced", n_estimators=300)
param_grid = {
    'max_depth': [9, 10, 12],
    'max_features': [3, 4, 5, 6, 7],
}
clf = GridSearchCV(random_forest, param_grid=param_grid, scoring=custom_scorer, cv=3, verbose=True, n_jobs=-1)
random_forest = clf.fit(x_train, y_train)
print(random_forest.score(x_train, y_train))
print(random_forest.score(x_test, y_test))
print(random_forest.best_params_)
y_pred_random_forest = random_forest.predict(x_test)
print("Random Forest: \n", classification_report(y_test, y_pred_random_forest))
</font>
# Refit the random forest with the tuned parameters.
best_rf_params = random_forest.best_params_
random_forest = RandomForestClassifier(max_depth=best_rf_params["max_depth"], max_features=best_rf_params["max_features"], class_weight="balanced", n_estimators=300)
random_forest.fit(x_train, y_train)
print(random_forest.score(x_train, y_train))
print(random_forest.score(x_test, y_test))
y_pred_random_forest = random_forest.predict(x_test)
print("Random Forest: \n", classification_report(y_test, y_pred_random_forest))
print("Random forest: \n", confusion_matrix(y_test, y_pred_random_forest))
cm = confusion_matrix(y_test, y_pred_random_forest, labels=[0, 1])
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, cmap="coolwarm")
The random forest performs significantly better in recall for Target "1"s, and has an overall accuracy score of around 79% on the test data. This may be improved further through oversampling the minority class in the test data
</font>
# Compare hard-prediction ROC curves for all models on one set of axes.
fpr_bag, tpr_bag, threshold_bag = roc_curve(y_test, y_pred_bag)
fpr_ada, tpr_ada, threshold_ada = roc_curve(y_test, y_pred_ada)
fpr_grb, tpr_grb, threshold_grb = roc_curve(y_test, y_pred_grb)
fpr_random, tpr_random, threshold_random = roc_curve(y_test, y_pred_random_forest)
plt.figure(figsize=(12, 8))
plt.plot([0, 1], [0, 1], "r--", label="Baseline")
# Dashed lines: ensembles; solid lines: single models.
for fpr_vals, tpr_vals, fmt, lbl in [
    (fpr_bag, tpr_bag, "b--", "Bagging"),
    (fpr_ada, tpr_ada, "m--", "AdaBoost"),
    (fpr_grb, tpr_grb, "g--", "GradientBoosting"),
    (fpr_random, tpr_random, "y--", "Random Forest"),
    (fpr_logistic, tpr_logistic, "b-", "Logistic"),
    (fpr_knn, tpr_knn, "m-", "Optimal KNN"),
    (fpr_dtree_clf, tpr_dtree_clf, "g-", "Optimal Decision Tree"),
]:
    plt.plot(fpr_vals, tpr_vals, fmt, label=lbl)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve Comparison")
plt.legend()
Bagging, random forests, ada boost and decision trees perform much better in this graph, primarily due to the fact that these are predictions from their optimized versions and they are able to produce better recall on Target = 1 cases
</font>
# Probability-based ROC comparison across all models.
y_pred_bag_prob = bag_.predict_proba(x_test)[:, 1]
y_pred_ada_prob = ada_.predict_proba(x_test)[:, 1]
y_pred_grb_prob = grb_.predict_proba(x_test)[:, 1]
y_pred_random_forest_prob = random_forest.predict_proba(x_test)[:, 1]
fpr_bag_prob, tpr_bag_prob, threshold_bag_prob = roc_curve(y_test, y_pred_bag_prob)
fpr_ada_prob, tpr_ada_prob, threshold_ada_prob = roc_curve(y_test, y_pred_ada_prob)
fpr_grb_prob, tpr_grb_prob, threshold_grb_prob = roc_curve(y_test, y_pred_grb_prob)
fpr_random_prob, tpr_random_prob, threshold_random_prob = roc_curve(y_test, y_pred_random_forest_prob)
plt.figure(figsize=(17, 13))
plt.plot([0, 1], [0, 1], "r--", label="Baseline")
# Dashed lines: ensembles; solid lines: single models.
for fpr_vals, tpr_vals, fmt, lbl in [
    (fpr_bag_prob, tpr_bag_prob, "b--", "Bagging"),
    (fpr_ada_prob, tpr_ada_prob, "m--", "AdaBoost"),
    (fpr_grb_prob, tpr_grb_prob, "g--", "GradientBoosting"),
    (fpr_random_prob, tpr_random_prob, "y--", "Random Forest"),
    (fpr_logistic_prob, tpr_logistic_prob, "b-", "Logistic"),
    (fpr_knn_prob, tpr_knn_prob, "m-", "Optimal KNN"),
    (fpr_dtree_clf_prob, tpr_dtree_clf_prob, "g-", "Optimal Decision Tree"),
]:
    plt.plot(fpr_vals, tpr_vals, fmt, label=lbl)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve Comparison")
plt.legend()
</font>
# Final AUC ranking computed from the probability-based ROC curves.
for model_name, fpr_vals, tpr_vals in [
    ("Gradient Boosting: ", fpr_grb_prob, tpr_grb_prob),
    ("Bagging: ", fpr_bag_prob, tpr_bag_prob),
    ("Random Forest: ", fpr_random_prob, tpr_random_prob),
    ("AdaBoost: ", fpr_ada_prob, tpr_ada_prob),
    ("Optimal Decision Tree: ", fpr_dtree_clf_prob, tpr_dtree_clf_prob),
    ("Logistic Regression: ", fpr_logistic_prob, tpr_logistic_prob),
    ("KNN: ", fpr_knn_prob, tpr_knn_prob),
]:
    print(model_name, auc(fpr_vals, tpr_vals))
Gradient boosting offers the strongest performance scores (AUC based). Random forest and Bagging perform quite closely to gradient boosting. Ada boost is a bit further behind. The optimal decision tree is just a bit lower than these options followed by logistic regression and KNN. It does seem that we need to tune the Ada boost a bit more to reach its complete potential
To ensure higher recall for AdaBoost, we have used our custom scorer to add a classification penalty for the Target = 1 cases. This creates a pseudo-weighting towards getting the Target = 1 cases right, steering the search towards parameter candidates with better recall on them. </font>